Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
The historical data for this project is available at https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
#Load Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the bank term-deposit marketing dataset and take a first look.
# NOTE(review): the UCI bank-full.csv is semicolon-delimited -- if this read
# yields a single column, pass sep=';'.
bank_deposit_df = pd.read_csv("bank-full.csv")
bank_deposit_df.head()
bank_deposit_df.tail()
# Check to see the number of records in the dataset
bank_deposit_df.shape
print(bank_deposit_df.columns)
bank_deposit_df.info()
# Converting the data type of the categorical columns.
# NOTE: 'Target' is intentionally last -- it is removed via cat_cols.pop()
# later, before the per-category EDA loop.
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'Target']
for col in cat_cols:
    # FIX: loop body restored to proper indentation (it was flush-left in
    # the flattened notebook dump, which is a SyntaxError in a script).
    bank_deposit_df[col] = bank_deposit_df[col].astype('category')
bank_deposit_df.dtypes
# Check to see if data has any missing values
bank_deposit_df.isnull().any()
# Analyze the distribution of the dataset (all columns, then transposed view)
bank_deposit_df.describe(include = 'all')
bank_deposit_df.describe(include = 'all').transpose()
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# Central tendency, spread and Tukey outlier fences for each numeric column.
# FIX: loop body restored to proper indentation; the unused counter `i`
# was removed; "3st Quartile" typo corrected to "3rd Quartile".
for col in num_cols:
    print('Attribute: ', col)
    print('Minimum: ', bank_deposit_df[col].min())
    print('Maximum: ', bank_deposit_df[col].max())
    print('Mean: ', bank_deposit_df[col].mean())
    print('Median: ', bank_deposit_df[col].median())
    print('Standard deviation: ', bank_deposit_df[col].std())
    Q1 = bank_deposit_df[col].quantile(q=0.25)
    Q3 = bank_deposit_df[col].quantile(q=0.75)
    print('1st Quartile (Q1) is: ', Q1)
    print('3rd Quartile (Q3) is: ', Q3)
    # Tukey fences: values beyond Q1/Q3 -/+ 1.5*IQR are flagged as outliers.
    L_outliers = Q1 - 1.5 * (Q3 - Q1)
    U_outliers = Q3 + 1.5 * (Q3 - Q1)
    print('Lower outliers: ', L_outliers)
    print('Upper outliers: ', U_outliers)
    plt.figure(figsize=(10, 8))
    sns.boxplot(bank_deposit_df[col])
    plt.show()
    print('-------------------------------------------------------------------------')
# Skewness distribution: one histogram with KDE overlay per numeric column.
# FIX: loop body restored to proper indentation; unused counter `i` removed;
# sns.distplot was deprecated and removed from seaborn, replaced with the
# supported equivalent histplot(..., kde=True).
for col in num_cols:
    plt.figure(figsize=(10, 8))
    sns.histplot(bank_deposit_df[col], kde=True)
    plt.show()
# Check the unique values in each column of the dataframe
bank_deposit_df.nunique()
# Automated EDA report.
# NOTE(review): the pandas_profiling package has been renamed to
# ydata-profiling in recent releases -- confirm the installed version.
import pandas_profiling
bank_deposit_df.profile_report()
from pandas_profiling import ProfileReport
profile = ProfileReport(bank_deposit_df, title="Pandas Profiling Report")
profile.to_widgets()
# Inspect the target attribute: cardinality, distinct values, class balance.
bank_deposit_df.Target.nunique()
bank_deposit_df.Target.unique()
bank_deposit_df["Target"].value_counts()
# Plot the distribution of the target attribute
plt.figure(figsize=(8,6))
# FIX: pass the column by name via x=. The original passed the Series
# positionally AND data=, which is ambiguous and rejected by modern seaborn.
sns.countplot(x='Target', data=bank_deposit_df)
plt.show()
# Importing the encoding library.
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column, including the target.
# fit_transform refits the encoder per column, so one instance suffices.
labelencoder_X = LabelEncoder()
for column in ['job', 'marital', 'education', 'default', 'housing', 'loan',
               'contact', 'month', 'poutcome', 'Target']:
    bank_deposit_df[column] = labelencoder_X.fit_transform(bank_deposit_df[column])

bank_deposit_df.head(10)
bank_deposit_df.describe().T
# Drop 'Target' (the last entry) from the categorical list; the EDA loops
# below compare each remaining categorical column against the target.
cat_cols.pop()
print(cat_cols)
# Cross-tabulate each categorical predictor against the target (row-normalized
# proportions) and plot the corresponding counts split by Target.
# FIX: loop body restored to proper indentation; unused counter `i` removed.
for col in cat_cols:
    plt.figure(figsize=(18, 12))
    print(pd.crosstab(bank_deposit_df[col], bank_deposit_df['Target'], normalize='index'))
    sns.countplot(x=col, data=bank_deposit_df, hue="Target")
    plt.show()
    plt.tight_layout()
    print('------------------------------------------------------------------------------------------------')
# Closer look at the binary credit-related attributes.
pd.crosstab(bank_deposit_df['default'], bank_deposit_df['Target'], normalize='index')
sns.countplot(x=bank_deposit_df['default'], data=bank_deposit_df, hue="Target")
pd.crosstab(bank_deposit_df['housing'], bank_deposit_df['Target'], normalize='index')
sns.countplot(x=bank_deposit_df['housing'], data=bank_deposit_df, hue="Target")
pd.crosstab(bank_deposit_df['loan'], bank_deposit_df['Target'], normalize='index')
sns.countplot(x=bank_deposit_df['loan'], data=bank_deposit_df, hue="Target")
# Correlation matrix (all columns are numeric after label encoding)
corr = bank_deposit_df.corr()
corr
# Heatmap of the pairwise correlations, centered at 0
sns.set(font_scale=1.15)
fig,ax=plt.subplots(figsize=(16,16))
sns.heatmap(corr, cmap='GnBu',annot=True,linewidths=0.01,center=0,linecolor='white',square=True)
plt.title('Correlation between attributes',fontsize=18)
ax.tick_params(labelsize=16)
# Pairwise scatter/KDE plots coloured by the target (slow on large data)
sns.pairplot(data = bank_deposit_df,hue='Target',diag_kind ='kde');
## Define X and Y variables
X = bank_deposit_df.drop('Target', axis=1)
Y = bank_deposit_df[['Target']]
#Convert categorical variables to dummy variables
# NOTE(review): the categorical columns were already label-encoded to plain
# integers above, so get_dummies finds no object/category columns here and is
# effectively a no-op -- confirm whether one-hot encoding of the original
# string categories was intended instead.
X = pd.get_dummies(X, drop_first=True)
X
## Split into training and test set (stratified on the target, 70/30)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,random_state=100,stratify=Y)
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Baseline model: plain logistic regression on the imbalanced data.
logreg = LogisticRegression(random_state=100)
logreg.fit(X_train, y_train)  # fit the model on train data
y_predict = logreg.predict(X_test)
y_predict

cmatrix = confusion_matrix(y_test, y_predict)
print(cmatrix)
# FIX: confusion_matrix orders rows/columns by sorted label, i.e. class 0
# ('no') first and class 1 ('yes') second; the original labels were reversed.
class_label = ["Negative", "Positive"]
bank_cm = pd.DataFrame(cmatrix, index=class_label, columns=class_label)
sns.heatmap(bank_cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Observed")
plt.show()

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

# Import the metrics and report train/test accuracy plus threshold metrics
from sklearn import metrics
acc_logreg = metrics.accuracy_score(y_test, y_predict)
print('Accuracy - Logistic Regression : ', acc_logreg)
print("Training accuracy", logreg.score(X_train, y_train))
print()
print("Testing accuracy", logreg.score(X_test, y_test))
print()
print("Recall:", recall_score(y_test, y_predict, pos_label=1))
print()
print("Precision:", precision_score(y_test, y_predict, pos_label=1))
print()
print("F1 Score:", f1_score(y_test, y_predict, pos_label=1))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))

# Store the accuracy results for each model in a dataframe for final comparison
results_df = pd.DataFrame({'Method': ['Logistic Regression'],
                           'Accuracy': acc_logreg.round(3),
                           'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                           'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                           'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                           'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df
# Invoking the decision tree classifier function, using 'entropy' method of finding the split columns.
from sklearn.tree import DecisionTreeClassifier

# Fully-grown (unpruned) tree -- expect near-perfect training accuracy.
dtree = DecisionTreeClassifier(criterion='entropy', random_state=100)
dtree.fit(X_train, y_train)
dtree.score(X_train, y_train)
dtree.score(X_test, y_test)

# Making the prediction
y_predict = dtree.predict(X_test)

# Evaluate the model using accuracy, confusion matrix, and classification report
acc_DT = metrics.accuracy_score(y_test, y_predict)
print('Accuracy DT: ', acc_DT)
print()
print('Confusion Matrix DT: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report DT: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))

# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Decision Tree'], 'Accuracy': acc_DT.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df

# Visualise the fitted tree with graphviz and dump feature importances.
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
import graphviz

bank_df = bank_deposit_df.copy()
feature_cols = X.columns
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bank_data.png')
Image(graph.create_png())
print(pd.DataFrame(dtree.feature_importances_, columns=["Imp"], index=X_train.columns))
# Regularizing the Decision tree classifier and fitting the model
reg_dtree = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                   max_depth=7, min_samples_leaf=5)
reg_dtree.fit(X_train, y_train)
y_predict = reg_dtree.predict(X_test)
# performance on train data
print('Performance on Training data using Pruned DT: ', reg_dtree.score(X_train, y_train))
# performance on test data
print('Performance on Testing data using Pruned DT: ', reg_dtree.score(X_test, y_test))
# Evaluate the model using accuracy, confusion matrix, and classification report
acc_pruned_DT = metrics.accuracy_score(y_test, y_predict)
print("Accuracy Pruned DT:", acc_pruned_DT)
print()
print('Confusion Matrix Pruned DT: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report Pruned DT: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Decision Tree with Pruning'], 'Accuracy': acc_pruned_DT.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
dot_data = StringIO()
# FIX: class_names must follow sorted class order (0 then 1); the original
# ['1','0'] mislabelled every node in the rendered tree and was inconsistent
# with the earlier export.
export_graphviz(reg_dtree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bank_data_pruned.png')
Image(graph.create_png())
from sklearn.ensemble import RandomForestClassifier

# Random forest with default (fully grown) trees.
rforest = RandomForestClassifier(n_estimators=50, random_state=100)
rforest = rforest.fit(X_train, y_train)
y_predict = rforest.predict(X_test)
acc_RF = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Random forest: ', acc_RF)
print()
print('Confusion Matrix for Random Forest: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Random Forest: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Random Forest'], 'Accuracy': acc_RF.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df

# Regularized random forest: shallower trees, larger leaves.
reg_rforest = RandomForestClassifier(criterion='entropy', random_state=100, max_depth=7, min_samples_leaf=5)
reg_rforest = reg_rforest.fit(X_train, y_train)
y_predict = reg_rforest.predict(X_test)
acc_pruned_RF = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Random forest pruned: ', acc_pruned_RF)
print()
print('Confusion Matrix Random Forest pruned: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Random Forest pruned: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Random Forest with Pruning'], 'Accuracy': acc_pruned_RF.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.ensemble import BaggingClassifier

# Bagging over the unpruned decision tree as base learner.
# NOTE(review): base_estimator was renamed to estimator in newer scikit-learn
# releases -- confirm the pinned version before upgrading.
bag = BaggingClassifier(base_estimator=dtree, n_estimators=50, random_state=100)
bag = bag.fit(X_train, y_train)
y_predict = bag.predict(X_test)
acc_BG = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Bagging: ', acc_BG)
print()
print('Confusion Matrix using Bagging: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Bagging: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Bagging'], 'Accuracy': acc_BG.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df

from sklearn.ensemble import AdaBoostClassifier

# NOTE(review): boosting a fully-grown tree is unusual; AdaBoost normally
# uses weak learners (e.g. depth-1 stumps, the sklearn default) -- confirm
# this base estimator choice is intentional.
aboost = AdaBoostClassifier(base_estimator=dtree, n_estimators=50, random_state=100)
aboost = aboost.fit(X_train, y_train)
y_predict = aboost.predict(X_test)
acc_AB = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using AdaBoost: ', acc_AB)
print()
print('Confusion Matrix using AdaBoost: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for AdaBoost: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Ada Boost'], 'Accuracy': acc_AB.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with sklearn's implementation.
gboost = GradientBoostingClassifier(n_estimators=50, random_state=100)
gboost = gboost.fit(X_train, y_train)
y_predict = gboost.predict(X_test)
acc_GB = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using GradientBoost: ', acc_GB)
print()
print('Confusion Matrix using GradientBoost: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for GradientBoost: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Gradient Boost'], 'Accuracy': acc_GB.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df

import xgboost as xgb
from xgboost import XGBClassifier

# XGBoost with the same ensemble size for a like-for-like comparison.
xgboost = XGBClassifier(n_estimators=50, random_state=100)
xgboost.fit(X_train, y_train)
y_predict = xgboost.predict(X_test)
acc_XGB = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using XGBoost: ', acc_XGB)
print()
print('Confusion Matrix using XGBoost: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for XGBoost: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['XG Boost'], 'Accuracy': acc_XGB.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
import lightgbm
from lightgbm import LGBMClassifier

# LightGBM with the same ensemble size as the other boosters.
lt_gbm = LGBMClassifier(n_estimators=50, random_state=100)
lt_gbm.fit(X_train, y_train)
y_predict = lt_gbm.predict(X_test)
acc_LGB = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using LightGBM: ', acc_LGB)
print()
print('Confusion Matrix using LightGBM: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for LightGBM: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Light GBM'], 'Accuracy': acc_LGB.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Rebuild X/Y from a fresh copy for the scaled + rebalanced experiments.
bank_new_df = bank_deposit_df.copy()
X = bank_new_df.drop('Target', axis=1)
Y = bank_new_df[['Target']]
#Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)
X
## Split into training and test set (stratified, same seed as before)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,random_state=100,stratify=Y)
# Standardize features: the scaler is fit on the training split only and
# then applied to both splits (no test-set leakage).
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# summarize class distribution
bank_new_df["Target"].value_counts()
print(X_train.shape)
print(y_train.shape)
# Rebalance the TRAINING split only: SMOTE oversampling followed by random
# under-sampling. NOTE(review): with both samplers at their default
# sampling_strategy, the under-sampling step appears to be a no-op once
# SMOTE has already balanced the classes -- confirm the intended ratios.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
# transform the column
oversample = SMOTE()
undersample = RandomUnderSampler()
steps = [('over', oversample), ('under', undersample)]
pipeline = Pipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
print(X_train.shape)
print(y_train.shape)
# Logistic regression retrained on the scaled, rebalanced training data.
logreg_new = LogisticRegression(random_state=100)
logreg_new.fit(X_train, y_train)  # fit the model on train data
y_predict = logreg_new.predict(X_test)
y_predict
cmatrix = confusion_matrix(y_test, y_predict)
print(cmatrix)
# FIX: confusion_matrix orders rows/columns by sorted label, i.e. class 0
# ('no') first and class 1 ('yes') second; the original labels were reversed.
class_label = ["Negative", "Positive"]
bank_cm = pd.DataFrame(cmatrix, index=class_label, columns=class_label)
sns.heatmap(bank_cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Observed")
plt.show()
# Calculate classification report
from sklearn.metrics import classification_report
acc_logreg_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Logistic Regression: ', acc_logreg_new)
print(classification_report(y_test, y_predict))
print("Training accuracy", logreg_new.score(X_train, y_train))
print()
print("Testing accuracy", logreg_new.score(X_test, y_test))
print()
print("Recall:", recall_score(y_test, y_predict, pos_label=1))
print()
print("Precision:", precision_score(y_test, y_predict, pos_label=1))
print()
print("F1 Score:", f1_score(y_test, y_predict, pos_label=1))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Logistic Regression with Oversampling'], 'Accuracy': acc_logreg_new.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
    {'penalty': ['l2', 'none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
    {'C': [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]},
    {'class_weight': [None, 'balanced']}
]
# Create grid search using 5-fold cross validation
clf = GridSearchCV(logreg_new, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit grid search
best_model = clf.fit(X_train, y_train)
# View best hyperparameters
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best Hyper Parameters are: ', best_model.best_params_)
# FIX: predict on the scaled test split BEFORE scoring. The original computed
# acc_logreg_hyper from the stale y_predict of the previous model, and called
# best_model.predict(X) on the raw, unscaled full feature matrix, which is
# meaningless for a model trained on scaled data.
y_predict = best_model.predict(X_test)
acc_logreg_hyper = metrics.accuracy_score(y_test, y_predict)
print('Accuracy - Logistic Regression with Oversampling & HyperTuning : ', acc_logreg_hyper)
print("Training accuracy", best_model.score(X_train, y_train))
print()
print("Testing accuracy", best_model.score(X_test, y_test))
print()
print("Recall:", recall_score(y_test, y_predict, pos_label=1))
print()
print("Precision:", precision_score(y_test, y_predict, pos_label=1))
print()
print("F1 Score:", f1_score(y_test, y_predict, pos_label=1))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Logistic Regression with Oversampling & HyperTuning'], 'Accuracy': acc_logreg_hyper.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Invoking the decision tree classifier function, using 'entropy' method of finding the split columns.
# Unpruned decision tree retrained on the scaled, rebalanced data.
dtree_new = DecisionTreeClassifier(criterion='entropy', random_state=100)
dtree_new.fit(X_train, y_train)
dtree_new.score(X_train, y_train)
dtree_new.score(X_test, y_test)
# Making the prediction
y_predict = dtree_new.predict(X_test)
# Evaluate the model using accuracy, confusion matrix, and classification report
acc_DT_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy DT: ', acc_DT_new)
print()
print('Confusion Matrix DT: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report DT: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Decision Tree with Oversampling'], 'Accuracy': acc_DT_new.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Regularizing the Decision tree classifier and fitting the model
reg_dtree_new = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                       max_depth=7, min_samples_leaf=5)
reg_dtree_new.fit(X_train, y_train)
y_predict = reg_dtree_new.predict(X_test)
# performance on train data
print('Performance on Training data using Pruned DT: ', reg_dtree_new.score(X_train, y_train))
# performance on test data
print('Performance on Testing data using Pruned DT: ', reg_dtree_new.score(X_test, y_test))
# Evaluate the model using accuracy, confusion matrix, and classification report
acc_pruned_DT_new = metrics.accuracy_score(y_test, y_predict)
print("Accuracy Pruned DT:", acc_pruned_DT_new)
print()
print('Confusion Matrix Pruned DT: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report Pruned DT: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Decision Tree with Pruning & Oversampling'], 'Accuracy': acc_pruned_DT_new.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Create the GridSearch estimator along with a parameter object containing the values to adjust
criterion = ['gini', 'entropy']
max_depth = [2, 4, 6, 7, 8, 10]
# FIX: min_samples_split must be >= 2 in scikit-learn; range(1, 50) made the
# grid search error out on the min_samples_split=1 candidate.
sample_split_range = list(range(2, 50))
param_grid = [
    {'criterion': criterion},
    {'max_depth': max_depth},
    {'min_samples_split': sample_split_range}
]
# Create grid search using 5-fold cross validation
clf = GridSearchCV(dtree_new, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit grid search
best_model = clf.fit(X_train, y_train)
# View best hyperparameters
print(best_model.best_score_)
print(best_model.best_params_)
# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(best_model.best_estimator_)
# Therefore final model is
# FIX: dropped the stray best_model.predict(X) call -- X is the raw, unscaled
# feature matrix, not the scaled data the model was trained on.
y_predict = best_model.predict(X_test)
acc_DT_tu = metrics.accuracy_score(y_test, y_predict)
print("Training accuracy", best_model.score(X_train, y_train))
print()
print("Testing accuracy", best_model.score(X_test, y_test))
print()
print("Recall:", recall_score(y_test, y_predict, pos_label=1))
print()
print("Precision:", precision_score(y_test, y_predict, pos_label=1))
print()
print("F1 Score:", f1_score(y_test, y_predict, pos_label=1))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)
print('Classification report for Decision Tree with Oversampling & HyperTuning: \n', metrics.classification_report(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Decision Tree with Oversampling & HyperTuning'], 'Accuracy': acc_DT_tu.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# Render the unpruned tree trained on the rebalanced data.
dot_data = StringIO()
export_graphviz(dtree_new, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bank_data_tuned.png')
Image(graph.create_png())
# Random forest retrained on the scaled, rebalanced data.
rforest_new = RandomForestClassifier(n_estimators=50, random_state=100)
rforest_new = rforest_new.fit(X_train, y_train)
y_predict = rforest_new.predict(X_test)
acc_RF_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Random forest: ', acc_RF_new)
print()
print('Confusion Matrix for Random Forest: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Random Forest: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Random Forest with Oversampling'], 'Accuracy': acc_RF_new.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df

# Regularized variant: shallower trees, larger leaves.
reg_rforest_new = RandomForestClassifier(criterion='entropy', random_state=100, max_depth=7, min_samples_leaf=5)
reg_rforest_new = reg_rforest_new.fit(X_train, y_train)
y_predict = reg_rforest_new.predict(X_test)
acc_pruned_RF_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Random forest pruned: ', acc_pruned_RF_new)
print()
print('Confusion Matrix Random Forest pruned: \n', metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Random Forest pruned: \n', metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Random Forest with Pruning & Oversampling'], 'Accuracy': acc_pruned_RF_new.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# Create the GridSearch estimator along with a parameter object containing the values to adjust
n_estimators = [100, 200, 300, 500]
max_features = ['sqrt']
max_depth = [5, 6, 7, 8]
min_samples_split = [2, 5, 10, 15]
min_samples_leaf = [1, 2, 5, 10]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,
                  min_samples_split=min_samples_split, max_features=max_features,
                  min_samples_leaf=min_samples_leaf)
# Create grid search using 5-fold cross validation
clf = GridSearchCV(rforest_new, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit grid search
best_model = clf.fit(X_train, y_train)
# View best hyperparameters
print(best_model.best_score_)
print(best_model.best_params_)
# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(best_model.best_estimator_)
# Therefore final model is
# FIX: dropped the stray best_model.predict(X) call -- X is the raw, unscaled
# feature matrix, not the scaled data the model was trained on.
y_predict = best_model.predict(X_test)
acc_RF_tu = metrics.accuracy_score(y_test, y_predict)
print("Training accuracy", best_model.score(X_train, y_train))
print()
print("Testing accuracy", best_model.score(X_test, y_test))
print()
print("Recall:", recall_score(y_test, y_predict, pos_label=1))
print()
print("Precision:", precision_score(y_test, y_predict, pos_label=1))
print()
print("F1 Score:", f1_score(y_test, y_predict, pos_label=1))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)
print('Classification report for Random Forest with Oversampling & HyperTuning: \n', metrics.classification_report(y_test, y_predict))
# Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method': ['Random Forest with Oversampling & HyperTuning'], 'Accuracy': acc_RF_tu.round(3), 'Recall': recall_score(y_test, y_predict, pos_label=1).round(3), 'Precision': precision_score(y_test, y_predict, pos_label=1).round(3), 'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3), 'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; use pd.concat instead.
results_df = pd.concat([results_df, r_df])
results_df
# ---- Bagging (on oversampled data), built on the tuned decision tree ----
bag_new = BaggingClassifier(base_estimator=dtree_new, n_estimators=50, random_state=100)
bag_new = bag_new.fit(X_train, y_train)
y_predict = bag_new.predict(X_test)
acc_BG_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Bagging: ',acc_BG_new)
print()
print('Confusion Matrix using Bagging: \n',metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for Bagging: \n',metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Bagging with Oversampling'],
                     'Accuracy': acc_BG_new.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- AdaBoost (on oversampled data), built on the tuned decision tree ----
aboost_new = AdaBoostClassifier(base_estimator=dtree_new, n_estimators=50, random_state=100)
aboost_new = aboost_new.fit(X_train, y_train)
y_predict = aboost_new.predict(X_test)
acc_AB_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using AdaBoost: ',acc_AB_new)
print()
print('Confusion Matrix using AdaBoost: \n',metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for AdaBoost: \n',metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Ada Boost with Oversampling'],
                     'Accuracy': acc_AB_new.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- Hyperparameter tuning for AdaBoost via exhaustive grid search ----
# Candidate values for each hyperparameter to be tuned
n_estimators = [10, 100, 200, 300, 500]
learning_rate = [0.001, 0.05, 0.01, 0.1]
param_grid = {"n_estimators": n_estimators, "learning_rate": learning_rate}

# Grid search with 5-fold cross validation, parallelised across all cores
clf = GridSearchCV(aboost_new, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fitting the search refits the best estimator on the full training data
best_model = clf.fit(X_train, y_train)
# ---- AdaBoost (Oversampling & HyperTuning): inspect search results & evaluate ----
# View best hyperparameters found by the grid search
print(best_model.best_score_)    # best mean cross-validated score
print(best_model.best_params_)   # parameter combination that achieved it
# Actual model object refit with those best parameters;
# also shows default parameters that we did not specify
print(best_model.best_estimator_)

# Therefore final model is: predict on the held-out test set
# (the redundant full-dataset predict whose result was discarded has been removed)
y_predict = best_model.predict(X_test)
acc_AB_tu = metrics.accuracy_score(y_test , y_predict)
print("Training accuracy",best_model.score(X_train,y_train))
print()
print("Testing accuracy",best_model.score(X_test, y_test))
print()
print("Recall:",recall_score(y_test,y_predict,pos_label=1))
print()
print("Precision:",precision_score(y_test,y_predict,pos_label=1))
print()
print("F1 Score:",f1_score(y_test,y_predict,pos_label=1))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)
print('Classification report for Adaboost with Oversampling & HyperTuning: \n',metrics.classification_report(y_test, y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Adaboost with Oversampling & HyperTuning'],
                     'Accuracy': acc_AB_tu.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- Gradient Boosting (on oversampled data) ----
gboost_new = GradientBoostingClassifier(n_estimators=50, random_state=100)
gboost_new = gboost_new.fit(X_train, y_train)
y_predict = gboost_new.predict(X_test)
acc_GB_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using GradientBoost: ',acc_GB_new)
print()
print('Confusion Matrix using GradientBoost: \n',metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for GradientBoost: \n',metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Gradient Boost with Oversampling'],
                     'Accuracy': acc_GB_new.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- Hyperparameter tuning for Gradient Boosting via grid search ----
# Candidate values for each hyperparameter to be tuned
n_estimators = [100, 200, 300, 400, 500]
learning_rate = [0.001, 0.05, 0.01, 0.1]
max_depth = [5, 6, 8, 10]
subsample = [0.5, 0.6, 0.8, 0.9, 1.0]
# BUG FIX: max_features was referenced in param_grid but never defined,
# which raised a NameError. Define a sensible candidate list here
# (None = consider all features at each split).
max_features = ["sqrt", "log2", None]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,
                  learning_rate=learning_rate, subsample=subsample,
                  max_features=max_features)
# Create grid search using 5-fold cross validation, parallelised across all cores
clf = GridSearchCV(gboost_new, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit grid search (refits the best estimator on the full training data)
best_model = clf.fit(X_train, y_train)
# ---- Gradient Boost (Oversampling & HyperTuning): inspect search results & evaluate ----
# View best hyperparameters found by the grid search
print(best_model.best_score_)    # best mean cross-validated score
print(best_model.best_params_)   # parameter combination that achieved it
# Actual model object refit with those best parameters;
# also shows default parameters that we did not specify
print(best_model.best_estimator_)

# Therefore final model is: predict on the held-out test set
# (the redundant full-dataset predict whose result was discarded has been removed)
y_predict = best_model.predict(X_test)
# BUG FIX: this accuracy was stored in acc_RF_tu (copy-paste from the Random
# Forest section), silently overwriting that variable; renamed to acc_GB_tu.
acc_GB_tu = metrics.accuracy_score(y_test , y_predict)
print("Training accuracy",best_model.score(X_train,y_train))
print()
print("Testing accuracy",best_model.score(X_test, y_test))
print()
print("Recall:",recall_score(y_test,y_predict,pos_label=1))
print()
print("Precision:",precision_score(y_test,y_predict,pos_label=1))
print()
print("F1 Score:",f1_score(y_test,y_predict,pos_label=1))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)
print('Classification report for Gradient Boost with Oversampling & HyperTuning: \n',metrics.classification_report(y_test, y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Gradient Boost with Oversampling & HyperTuning'],
                     'Accuracy': acc_GB_tu.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- XGBoost (on oversampled data) ----
xgboost_new = XGBClassifier(n_estimators=50, random_state=100)
xgboost_new.fit(X_train, y_train)
y_predict = xgboost_new.predict(X_test)
acc_XGB_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using XGBoost: ',acc_XGB_new)
print()
print('Confusion Matrix using XGBoost: \n',metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for XGBoost: \n',metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['XG Boost with Oversampling'],
                     'Accuracy': acc_XGB_new.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
# ---- LightGBM (on oversampled data) ----
lt_gbm_new = LGBMClassifier(n_estimators=50, random_state=100)
lt_gbm_new.fit(X_train, y_train)
y_predict = lt_gbm_new.predict(X_test)
acc_LGB_new = metrics.accuracy_score(y_test, y_predict)
print('Accuracy using LightGBM: ',acc_LGB_new)
print()
print('Confusion Matrix using LightGBM: \n',metrics.confusion_matrix(y_test, y_predict))
print()
print('Classification report for LightGBM: \n',metrics.classification_report(y_test, y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))

# Store the accuracy results for each model in a dataframe for final comparison.
# pd.concat replaces DataFrame.append, which was deprecated and removed in pandas 2.0.
r_df = pd.DataFrame({'Method': ['Light GBM with Oversampling'],
                     'Accuracy': acc_LGB_new.round(3),
                     'Recall': recall_score(y_test, y_predict, pos_label=1).round(3),
                     'Precision': precision_score(y_test, y_predict, pos_label=1).round(3),
                     'F1 Score': f1_score(y_test, y_predict, pos_label=1).round(3),
                     'Roc Auc Score': roc_auc_score(y_test, y_predict).round(3)})
results_df = pd.concat([results_df, r_df])
results_df
The goal of this project is to build a machine learning model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, using the data collected from existing customers. As demonstrated above, a variety of classifiers (algorithms) were used and their metrics compared both before and after addressing the imbalance in the target variable.
The Receiver Operator Characteristic (ROC) curve is an evaluation metric for binary classification problems. It is a probability curve that plots the TPR(True Positive Rate) against FPR(False Positive Rate) at various threshold values and essentially separates the ‘signal’ from the ‘noise’. The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve.
The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes. Both time and money can be saved by knowing the characteristics of clients to market to, and that would lead to increased growth and revenue. So the AUC score calculated above is chosen as the performance metric for picking the best model, since it captures the trade-off between the true positive rate and the false positive rate.
From the above dataframe, the models Gradient Boost and Random Forest with optimized hyperparameters have higher AUC scores and thus did a better job of classifying the positive class in the dataset. The Gradient Boost model was able to catch 77% (Recall) of customers that will subscribe to a term deposit, while Random Forest with HyperTuning was able to catch 79% (Recall) of such customers.